This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:
In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the sample input tables that will be used for blocking:
In [3]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids doubled separators
# if the install path already ends with one.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [4]:
# Load table A and then table B from CSV, declaring 'ID' as the key attribute of each
A, B = [em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B)]
In [5]:
# Generate the feature table for blocking from the two input tables A and B
feature_table = em.get_features_for_blocking(A, B)
In [6]:
# Check what kind of object get_features_for_blocking returned
type(feature_table)
Out[6]:
In [9]:
# Preview the first rows of the feature table
feature_table.head()
Out[9]:
In [11]:
# Drop the row whose index label is 0 (the first row when the table still has
# its default index). errors='ignore' keeps this cell idempotent: re-running it
# after the row is already gone is a no-op instead of raising KeyError.
feature_table = feature_table.drop(0, errors='ignore')
In [12]:
# Preview the first rows again to see the table without the row labelled 0
feature_table.head()
Out[12]:
In [15]:
# Keep only the features whose left attribute is 'name'; every other feature is removed
feature_table = feature_table.loc[feature_table['left_attribute'] == 'name']
In [14]:
# Display the whole feature table to inspect the features that remain
feature_table
Out[14]:
In [16]:
# Keep only the features whose similarity function is 'jaccard'; every other feature is removed
feature_table = feature_table.loc[feature_table['simfunction'] == 'jaccard']
In [17]:
# Display the features that remain after the jaccard filter
feature_table
Out[17]: